Exploring Market Basket Analysis in Istanbul Retail Data

Welcome to the shopping world of Istanbul! Our dataset contains shopping information from 10 different shopping malls between 2021 and 2023. The dataset includes essential information such as invoice numbers, customer IDs, age, gender, payment methods, product categories, quantity, price, order dates, and shopping mall locations. Let’s explore the dataset and discover the fascinating world of Istanbul shopping!

Load the packages

library(ggplot2)
library(dplyr)
library(plotly)
library(pacman)
p_load(ggplot2, data.table, rgl, summarytools, GGally, factoextra, cluster, tidyverse)
library(summarytools)
print("Required Packages are loaded" )
## [1] "Required Packages are loaded"

Read the data set

# Load the raw shopping records into `sales` and preview the first rows.
# NOTE(review): the path is machine-specific; adjust it before running
# on another computer.
sales <- read.csv(
  file = "C:\\Users\\LENOVO\\Downloads\\customer_shopping_data.csv"
)
head(sales)
##   invoice_no customer_id gender age category quantity   price payment_method
## 1    I138884     C241288 Female  28 Clothing        5 1500.40    Credit Card
## 2    I317333     C111565   Male  21    Shoes        3 1800.51     Debit Card
## 3    I127801     C266599   Male  20 Clothing        1  300.08           Cash
## 4    I173702     C988172 Female  66    Shoes        5 3000.85    Credit Card
## 5    I337046     C189076 Female  53    Books        4   60.60           Cash
## 6    I227836     C657758 Female  28 Clothing        5 1500.40    Credit Card
##   invoice_date  shopping_mall
## 1     5/8/2022         Kanyon
## 2   12/12/2021 Forum Istanbul
## 3    9/11/2021      Metrocity
## 4   16/05/2021   Metropol AVM
## 5   24/10/2021         Kanyon
## 6   24/05/2022 Forum Istanbul

Statistics of the Dataset

# Per-column summary of `sales`.
summary(sales)
##   invoice_no        customer_id           gender               age       
##  Length:99457       Length:99457       Length:99457       Min.   :18.00  
##  Class :character   Class :character   Class :character   1st Qu.:30.00  
##  Mode  :character   Mode  :character   Mode  :character   Median :43.00  
##                                                           Mean   :43.43  
##                                                           3rd Qu.:56.00  
##                                                           Max.   :69.00  
##    category            quantity         price         payment_method    
##  Length:99457       Min.   :1.000   Min.   :   5.23   Length:99457      
##  Class :character   1st Qu.:2.000   1st Qu.:  45.45   Class :character  
##  Mode  :character   Median :3.000   Median : 203.30   Mode  :character  
##                     Mean   :3.003   Mean   : 689.26                     
##                     3rd Qu.:4.000   3rd Qu.:1200.32                     
##                     Max.   :5.000   Max.   :5250.00                     
##  invoice_date       shopping_mall     
##  Length:99457       Length:99457      
##  Class :character   Class :character  
##  Mode  :character   Mode  :character  
##                                       
##                                       
## 
# Compact structure listing of `sales`: dimensions, column types and the
# first few values of each column.
str(sales)
## 'data.frame':    99457 obs. of  10 variables:
##  $ invoice_no    : chr  "I138884" "I317333" "I127801" "I173702" ...
##  $ customer_id   : chr  "C241288" "C111565" "C266599" "C988172" ...
##  $ gender        : chr  "Female" "Male" "Male" "Female" ...
##  $ age           : int  28 21 20 66 53 28 49 32 69 60 ...
##  $ category      : chr  "Clothing" "Shoes" "Clothing" "Shoes" ...
##  $ quantity      : int  5 3 1 5 4 5 1 2 3 2 ...
##  $ price         : num  1500.4 1800.5 300.1 3000.8 60.6 ...
##  $ payment_method: chr  "Credit Card" "Debit Card" "Cash" "Credit Card" ...
##  $ invoice_date  : chr  "5/8/2022" "12/12/2021" "9/11/2021" "16/05/2021" ...
##  $ shopping_mall : chr  "Kanyon" "Forum Istanbul" "Metrocity" "Metropol AVM" ...
# Print the summarytools dfSummary() overview of `sales` using the
# 'render' print method.
print(dfSummary(sales), method = 'render')

Data Frame Summary

sales

Dimensions: 99457 x 10
Duplicates: 0
No Variable Stats / Values Freqs (% of Valid) Graph Valid Missing
1 invoice_no [character]
1. I100008
2. I100014
3. I100015
4. I100024
5. I100027
6. I100028
7. I100031
8. I100033
9. I100037
10. I100041
[ 99447 others ]
1(0.0%)
1(0.0%)
1(0.0%)
1(0.0%)
1(0.0%)
1(0.0%)
1(0.0%)
1(0.0%)
1(0.0%)
1(0.0%)
99447(100.0%)
99457 (100.0%) 0 (0.0%)
2 customer_id [character]
1. C100004
2. C100005
3. C100006
4. C100012
5. C100019
6. C100025
7. C100028
8. C100030
9. C100034
10. C100041
[ 99447 others ]
1(0.0%)
1(0.0%)
1(0.0%)
1(0.0%)
1(0.0%)
1(0.0%)
1(0.0%)
1(0.0%)
1(0.0%)
1(0.0%)
99447(100.0%)
99457 (100.0%) 0 (0.0%)
3 gender [character]
1. Female
2. Male
59482(59.8%)
39975(40.2%)
99457 (100.0%) 0 (0.0%)
4 age [integer]
Mean (sd) : 43.4 (15)
min ≤ med ≤ max:
18 ≤ 43 ≤ 69
IQR (CV) : 26 (0.3)
52 distinct values 99457 (100.0%) 0 (0.0%)
5 category [character]
1. Books
2. Clothing
3. Cosmetics
4. Food & Beverage
5. Shoes
6. Souvenir
7. Technology
8. Toys
4981(5.0%)
34487(34.7%)
15097(15.2%)
14776(14.9%)
10034(10.1%)
4999(5.0%)
4996(5.0%)
10087(10.1%)
99457 (100.0%) 0 (0.0%)
6 quantity [integer]
Mean (sd) : 3 (1.4)
min ≤ med ≤ max:
1 ≤ 3 ≤ 5
IQR (CV) : 2 (0.5)
1:19767(19.9%)
2:19828(19.9%)
3:20149(20.3%)
4:19723(19.8%)
5:19990(20.1%)
99457 (100.0%) 0 (0.0%)
7 price [numeric]
Mean (sd) : 689.3 (941.2)
min ≤ med ≤ max:
5.2 ≤ 203.3 ≤ 5250
IQR (CV) : 1154.9 (1.4)
40 distinct values 99457 (100.0%) 0 (0.0%)
8 payment_method [character]
1. Cash
2. Credit Card
3. Debit Card
44447(44.7%)
34931(35.1%)
20079(20.2%)
99457 (100.0%) 0 (0.0%)
9 invoice_date [character]
1. 24/11/2021
2. 26/02/2023
3. 23/10/2022
4. 25/07/2022
5. 6/1/2021
6. 19/07/2021
7. 7/1/2023
8. 6/3/2022
9. 9/6/2022
10. 3/6/2022
[ 787 others ]
159(0.2%)
156(0.2%)
155(0.2%)
155(0.2%)
155(0.2%)
154(0.2%)
151(0.2%)
150(0.2%)
150(0.2%)
149(0.1%)
97923(98.5%)
99457 (100.0%) 0 (0.0%)
10 shopping_mall [character]
1. Cevahir AVM
2. Emaar Square Mall
3. Forum Istanbul
4. Istinye Park
5. Kanyon
6. Mall of Istanbul
7. Metrocity
8. Metropol AVM
9. Viaport Outlet
10. Zorlu Center
4991(5.0%)
4811(4.8%)
4947(5.0%)
9781(9.8%)
19823(19.9%)
19943(20.1%)
15011(15.1%)
10161(10.2%)
4914(4.9%)
5075(5.1%)
99457 (100.0%) 0 (0.0%)

Generated by summarytools 1.0.1 (R version 4.2.2)
2023-03-20

Exploratory Data Analysis

Distribution of Age

# Overlaid, semi-transparent density curves of customer age, one per gender.
ggplot(data = sales, mapping = aes(x = age, fill = gender)) +
  geom_density(alpha = 0.5) +
  labs(title = "Distribution of Age")

Let’s find out who shops more

# Bar chart counting the rows of `sales` for each gender.
ggplot(data = sales, mapping = aes(x = gender)) +
  geom_bar(color = "black", fill = "cornflowerblue") +
  labs(x = "Gender", title = "  Total no.of shoppers")

Preferred Categories by Gender

# Price against quantity, coloured by product category, with a smoothed
# trend per category and one facet row per gender.
# Every layer, including labs(), must be chained with `+`; a labs() call
# left on its own line is evaluated separately and never reaches the plot.
ggplot(sales, aes(x = quantity, y = price, color = category)) +
  geom_point(size = 2) +
  geom_smooth() +
  facet_grid(gender ~ .) +
  labs(title = "Preferred Categories by Gender", x = "Quantity", y = "Price")
## $x
## [1] "Quantity"
## 
## $y
## [1] "Price"
## 
## $title
## [1] "Preferred Categories by Gender"
## 
## attr(,"class")
## [1] "labels"

Average Price for each category

# Box plot of price for each category; the white diamond marks the mean.
# The legend is hidden because the x axis already names the categories.
ggplot(data = sales, mapping = aes(x = category, y = price, fill = category)) +
  geom_boxplot() +
  stat_summary(
    fun = "mean",
    geom = "point",
    shape = 18,
    size = 3,
    color = "white"
  ) +
  labs(x = "Category", y = "Price", title = "Average Price by Category") +
  theme(legend.position = "none")

Total Sales of each Category

# Total sales per category.
#
# Revenue is computed per invoice row (quantity * price) and only then
# summed within each category. Multiplying the per-category sums instead
# (sum(quantity) * sum(price)) is a product of sums, not a sum of products:
# it inflates the totals by orders of magnitude and can distort the ranking.
# NOTE(review): this treats `price` as a per-unit price, as the original
# calculation did. If `price` is already the row total, rank on sum(price)
# instead; confirm against the dataset's data dictionary.
category_sales <- aggregate(
  cbind(quantity, price, total_sales) ~ category,
  data = transform(sales, total_sales = quantity * price),
  FUN = sum
)

# sort the categories by total sales (descending order)
category_sales <- category_sales[
  order(category_sales$total_sales, decreasing = TRUE),
]

# print the top 5 categories by total sales
head(category_sales, n = 5)
##     category quantity    price  total_sales
## 2   Clothing   103558 31075685 3.218136e+12
## 5      Shoes    30217 18135337 5.479955e+11
## 7 Technology    15021 15772050 2.369120e+11
## 3  Cosmetics    45465  1848607 8.404691e+10
## 8       Toys    30321  1086705 3.294997e+10
# Interactive chart with one point per category at its total sales.
# The hover label is carried in the unofficial `text` aesthetic.
gg <- ggplot(
  category_sales,
  aes(
    x = category,
    y = total_sales,
    text = paste("Category: ", category, "<br>Total Sales: $", total_sales)
  )
) +
  geom_point() +
  labs(title = "Category vs. Total Sales", x = "Category", y = "Total Sales") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Hand the plot to plotly, showing only the custom hover label.
ggplotly(gg, tooltip = "text")

Performance of Malls across years

# Extract the calendar year from each invoice date (day/month/year text)
# and store it as a new `year` column.
sales$year <- lubridate::year(
  as.Date(sales$invoice_date, format = "%d/%m/%Y")
)

# Sum `price` for every mall/year combination.
mall_sales <- aggregate(price ~ shopping_mall + year, data = sales, FUN = sum)

# One line per mall showing its summed price across the years.
ggplot(data = mall_sales, mapping = aes(x = year, y = price, color = shopping_mall)) +
  geom_line() +
  labs(
    title = "Total Sales by Mall and Year",
    x = "Year",
    y = "Total Sales",
    color = "shopping Mall"
  ) +
  theme_bw()

Smooth Layer with regression

# Store the year as a factor so each year gets its own colour and its own
# fitted line.
sales$year1 <- factor(sales$year)

# Linear fit of price on age, drawn separately for every year.
ggplot(data = sales, mapping = aes(x = age, y = price, color = year1)) +
  geom_smooth(method = "lm") +
  labs(
    x = "Age",
    y = "Price",
    title = "Linear Regression Plot of Age vs. Price"
  )

Clustering

# K-means clustering (k = 3) on standardized quantity and price, shown as
# an interactive scatter plot.

# subset the data to only include quantity and price columns
data <- subset(sales, select = c(quantity, price))

# standardize the data to have zero mean and unit variance
data_scaled <- scale(data)

# perform k-means clustering with k = 3; the seed fixes the random
# starting centres so the result is reproducible
set.seed(123)
kmeans_obj <- kmeans(data_scaled, centers = 3)

# add cluster labels to the original data
sales$cluster <- kmeans_obj$cluster

# create a scatter plot with ggplot
# The `text` aesthetic supplies the hover label: ggplotly() is asked to
# show only "text" below, so without this mapping the tooltips are empty.
p <- ggplot(
  sales,
  aes(
    x = quantity,
    y = price,
    color = factor(cluster),
    text = paste0(
      "Quantity: ", quantity,
      "<br>Price: ", price,
      "<br>Cluster: ", cluster
    )
  )
) +
  geom_point() +
  labs(title = "K-Means Clustering Results",
       x = "Quantity",
       y = "Price",
       color = "Cluster") +
  theme_minimal()

# convert ggplot to plotly and make it interactive
p <- ggplotly(p, tooltip = c("text"))
# namespaced so that another attached package's layout() cannot mask it
p <- plotly::layout(p, title = "K-Means Clustering Results")
p

Clustering on age and price, visualized in principal-component space

 #subset the data to only include age and price columns
data <- subset(sales, select = c(age, price))

# Centre each column and scale it to unit variance.
data_scaled <- scale(data)

# Three-cluster k-means; the fixed seed makes the random start repeatable.
set.seed(123)
kmeans_obj <- kmeans(data_scaled, centers = 3)

# Write the cluster labels into sales$cluster (replacing any labels
# already stored in that column).
sales$cluster <- kmeans_obj$cluster

# Principal components of the standardized age/price data.
pca <- prcomp(data_scaled)

# Scatter of the observations on the principal components, grouped
# (coloured) by their k-means cluster.
fviz_pca_ind(
  pca,
  geom = "point",
  habillage = sales$cluster,
  ggtheme = theme_minimal()
)